05 - Facies Classifier

George Crowther

This is an extension / amalgamation of prior entries. The workflow is similar to those completed previously, namely:

  • Load the data and encode string features as integers
  • Cursory data examination (this workbook does not attempt to detail the full data analysis)
  • Group the data by well and brute-force feature creation
    • Feature creation focuses on bringing results from adjacent samples into the features (a minimal sketch follows this list)
    • Look at some ratios between features
  • Use TPOT to train a classifier (exported_pipeline)
  • Repeat the feature creation and extraction on the test dataset
  • Predict the results
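For reference before diving in, the adjacent-sample idea amounts to shifting each log so that neighbouring measurements become extra columns. A minimal sketch with hypothetical column names (the full version appears in In [9]):

import pandas as pd

df = pd.DataFrame({'GR': [77.4, 78.3, 79.1, 86.1]})        # rows sorted by Depth
df['GR_prev'] = df['GR'].shift(1).fillna(method='bfill')   # shallower neighbour
df['GR_next'] = df['GR'].shift(-1).fillna(method='ffill')  # deeper neighbour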

In [1]:
import pandas as pd
import bokeh.plotting as bk
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

import sys
# Note: sys.path does not expand '~', so an absolute path is used here
sys.path.append('/home/slygeorge/Documents/Python/SEG ML Competition')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

bk.output_notebook()



In [2]:
# Input file paths
train_path = '../training_data.csv'

# Read training data to dataframe
train = pd.read_csv(train_path)

# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)

well_names = train['Well Name']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
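
The integer facies classes 1-9 map positionally onto facies_labels (1 = 'SS', ..., 9 = 'BS'), so translating back to strings is a small lookup. A sketch (facies_lookup is a hypothetical helper name):

# Translate a numeric facies class (1-9) to its string label
facies_lookup = {i + 1: label for i, label in enumerate(facies_labels)}
facies_lookup[3]  # -> 'FSiS'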

In [3]:
train.head()


Out[3]:
class Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915

In [4]:
train.dropna().describe()


Out[4]:
class Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
count 3232.000000 3232.000000 3232.000000 3232.000000 3232.000000 3232.000000 3232.000000 3232.000000 3232.000000
mean 4.422030 2875.824567 66.135769 0.642719 3.559642 13.483213 3.725014 1.498453 0.520287
std 2.504243 131.006274 30.854826 0.241845 5.228948 7.698980 0.896152 0.500075 0.286792
min 1.000000 2573.500000 13.250000 -0.025949 -21.832000 0.550000 0.200000 1.000000 0.010000
25% 2.000000 2791.000000 46.918750 0.492750 1.163750 8.346750 3.100000 1.000000 0.273000
50% 4.000000 2893.500000 65.721500 0.624437 3.500000 12.150000 3.551500 1.000000 0.526000
75% 6.000000 2980.000000 79.626250 0.812735 6.432500 16.453750 4.300000 2.000000 0.767250
max 9.000000 3122.500000 361.150000 1.480000 18.600000 84.400000 8.094000 2.000000 1.000000

In [5]:
# Some quick-look plots. PE is highlighted, as it appears to be missing from the alternative version of the training dataset
plots = []
for well, group in train.groupby('Well Name'):
    group = group.sort_values(by = 'Depth')
    plots.append(bk.figure(height = 500, width = 150))
    plots[-1].line(group['PE'], group['Depth'], color = 'blue')
    plots[-1].line(group['DeltaPHI'], group['Depth'], color = 'red')
    plots[-1].title.text = well
    
grid = bk.gridplot([plots])
bk.show(grid)



In [6]:
# Set string features to integers

for i, value in enumerate(train['Formation'].unique()):
    train.loc[train['Formation'] == value, 'Formation'] = i
    
for i, value in enumerate(train['Well Name'].unique()):
    train.loc[train['Well Name'] == value, 'Well Name'] = i
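
The two loops above implement a simple label encoding, with integers assigned in order of first appearance. pd.factorize does the same in one call per column; an equivalent sketch, had it been used instead:

# Codes follow order of first appearance, matching the loops above
train['Formation'] = pd.factorize(train['Formation'])[0]
train['Well Name'] = pd.factorize(train['Well Name'])[0]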

In [7]:
# Preserve the original index so results can be reassigned later (initially added while experimenting with upsampling)

train['orig_index'] = train.index

In [8]:
# Define resample factors
resample_factors = [2, 5, 10, 25, 50]

initial_columns = ['Formation', 'Well Name', 'Depth', 'GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
div_columns = ['Depth', 'GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']

In [9]:
# Use rolling windows through each well's samples, grouping by well name.

# Empty list to hold frames
mean_frames = []
above = []
below = []

for well, group in train.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
       'RELPOS']]
        
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
        skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]
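        # Note: skew_frame is computed above but not included in f_frame below,
        # so the skew features never reach the model.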
        
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        
        constructor_list.append(f_frame)
        
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['class'] = group['class']
    well_frame['Well Name'] = well
    # orig index is holding the original index locations, to make extracting the results trivial
    well_frame['orig_index'] = group['orig_index']
    df = group.sort_values('Depth')
    u = df.shift(-1).fillna(method = 'ffill')
    b = df.shift(1).fillna(method = 'bfill')
    above.append(u[div_columns])
    below.append(b[div_columns])
    
    mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))
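
The five near-identical rolling blocks in the loop above could be collapsed with a loop over statistic names. A sketch of a hypothetical rolling_stats helper that reproduces the rolling statistics kept in f_frame (Diff/Rdiff are not rolling operations and would stay separate):

def rolling_stats(working_frame, f, stats=('mean', 'max', 'min', 'std', 'var')):
    # Build one frame per rolling statistic, named e.g. 'Mean_5_GR'
    frames = []
    for stat in stats:
        s = getattr(working_frame.rolling(window=f, center=True), stat)()
        s = s.interpolate(method='index', limit_direction='both')
        s.columns = ['{0}_{1}_{2}'.format(stat.capitalize(), f, c) for c in s.columns]
        frames.append(s)
    return pd.concat(frames, axis=1)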

In [10]:
# Concatenate all sub-frames together into a single 'upsampled_frame'
frame = train
frame.index = frame['orig_index']
frame.drop(['orig_index', 'class', 'Well Name'], axis = 1, inplace = True)

for f in mean_frames:
    f.index = f['orig_index']

rolling_frame = pd.concat(mean_frames, axis = 0)
above_frame = pd.concat(above)
above_frame.columns = ['above_'+ column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_'+ column for column in below_frame.columns]
upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)

In [11]:
# Features is the column set used for training the model
features = [feature for feature in upsampled_frame.columns if 'class' not in feature]

In [12]:
# Normalise dataset
std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])

train_std = std_scaler.transform(upsampled_frame[features])

train_std_frame = upsampled_frame
for i, column in enumerate(features):
    train_std_frame.loc[:, column] = train_std[:, i]

upsampled_frame_std = train_std_frame
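
A quick sanity check on the scaling: each standardized column should now have near-zero mean and (approximately, up to the ddof convention) unit standard deviation.

# Expect roughly 0.0 and 1.0
print(upsampled_frame_std['GR'].mean(), upsampled_frame_std['GR'].std())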

In [13]:
# Create ratios between features
div_columns = ['Depth', 'GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']

for feature in div_columns:
    for f in div_columns:
        if f == feature:
            continue
        upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
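
One caveat with these ratios: since standardization has already centred the features, the denominators pass through zero and some ratios become ±inf or extremely large. A hedged cleanup step, not applied in the original run:

# Replace infinities from zero denominators, then fill the gaps
# (hypothetical cleanup, not part of the run shown here)
upsampled_frame.replace([np.inf, -np.inf], np.nan, inplace=True)
upsampled_frame.fillna(method='bfill', inplace=True)
upsampled_frame.fillna(method='ffill', inplace=True)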

In [14]:
features = [column for column in upsampled_frame.columns if 'class' not in column]
print(features)


['Formation', 'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'Mean_2_Depth', 'Mean_2_GR', 'Mean_2_ILD_log10', 'Mean_2_DeltaPHI', 'Mean_2_PHIND', 'Mean_2_PE', 'Mean_2_NM_M', 'Mean_2_RELPOS', 'Max_2_Depth', 'Max_2_GR', 'Max_2_ILD_log10', 'Max_2_DeltaPHI', 'Max_2_PHIND', 'Max_2_PE', 'Max_2_NM_M', 'Max_2_RELPOS', 'Min_2_Depth', 'Min_2_GR', 'Min_2_ILD_log10', 'Min_2_DeltaPHI', 'Min_2_PHIND', 'Min_2_PE', 'Min_2_NM_M', 'Min_2_RELPOS', 'Std_2_Depth', 'Std_2_GR', 'Std_2_ILD_log10', 'Std_2_DeltaPHI', 'Std_2_PHIND', 'Std_2_PE', 'Std_2_NM_M', 'Std_2_RELPOS', 'Var_2_Depth', 'Var_2_GR', 'Var_2_ILD_log10', 'Var_2_DeltaPHI', 'Var_2_PHIND', 'Var_2_PE', 'Var_2_NM_M', 'Var_2_RELPOS', 'Diff_2_Depth', 'Diff_2_GR', 'Diff_2_ILD_log10', 'Diff_2_DeltaPHI', 'Diff_2_PHIND', 'Diff_2_PE', 'Diff_2_NM_M', 'Diff_2_RELPOS', 'Rdiff_2_Depth', 'Rdiff_2_GR', 'Rdiff_2_ILD_log10', 'Rdiff_2_DeltaPHI', 'Rdiff_2_PHIND', 'Rdiff_2_PE', 'Rdiff_2_NM_M', 'Rdiff_2_RELPOS', 'Mean_5_Depth', 'Mean_5_GR', 'Mean_5_ILD_log10', 'Mean_5_DeltaPHI', 'Mean_5_PHIND', 'Mean_5_PE', 'Mean_5_NM_M', 'Mean_5_RELPOS', 'Max_5_Depth', 'Max_5_GR', 'Max_5_ILD_log10', 'Max_5_DeltaPHI', 'Max_5_PHIND', 'Max_5_PE', 'Max_5_NM_M', 'Max_5_RELPOS', 'Min_5_Depth', 'Min_5_GR', 'Min_5_ILD_log10', 'Min_5_DeltaPHI', 'Min_5_PHIND', 'Min_5_PE', 'Min_5_NM_M', 'Min_5_RELPOS', 'Std_5_Depth', 'Std_5_GR', 'Std_5_ILD_log10', 'Std_5_DeltaPHI', 'Std_5_PHIND', 'Std_5_PE', 'Std_5_NM_M', 'Std_5_RELPOS', 'Var_5_Depth', 'Var_5_GR', 'Var_5_ILD_log10', 'Var_5_DeltaPHI', 'Var_5_PHIND', 'Var_5_PE', 'Var_5_NM_M', 'Var_5_RELPOS', 'Diff_5_Depth', 'Diff_5_GR', 'Diff_5_ILD_log10', 'Diff_5_DeltaPHI', 'Diff_5_PHIND', 'Diff_5_PE', 'Diff_5_NM_M', 'Diff_5_RELPOS', 'Rdiff_5_Depth', 'Rdiff_5_GR', 'Rdiff_5_ILD_log10', 'Rdiff_5_DeltaPHI', 'Rdiff_5_PHIND', 'Rdiff_5_PE', 'Rdiff_5_NM_M', 'Rdiff_5_RELPOS', 'Mean_10_Depth', 'Mean_10_GR', 'Mean_10_ILD_log10', 'Mean_10_DeltaPHI', 'Mean_10_PHIND', 'Mean_10_PE', 'Mean_10_NM_M', 'Mean_10_RELPOS', 'Max_10_Depth', 'Max_10_GR', 'Max_10_ILD_log10', 'Max_10_DeltaPHI', 'Max_10_PHIND', 'Max_10_PE', 'Max_10_NM_M', 'Max_10_RELPOS', 'Min_10_Depth', 'Min_10_GR', 'Min_10_ILD_log10', 'Min_10_DeltaPHI', 'Min_10_PHIND', 'Min_10_PE', 'Min_10_NM_M', 'Min_10_RELPOS', 'Std_10_Depth', 'Std_10_GR', 'Std_10_ILD_log10', 'Std_10_DeltaPHI', 'Std_10_PHIND', 'Std_10_PE', 'Std_10_NM_M', 'Std_10_RELPOS', 'Var_10_Depth', 'Var_10_GR', 'Var_10_ILD_log10', 'Var_10_DeltaPHI', 'Var_10_PHIND', 'Var_10_PE', 'Var_10_NM_M', 'Var_10_RELPOS', 'Diff_10_Depth', 'Diff_10_GR', 'Diff_10_ILD_log10', 'Diff_10_DeltaPHI', 'Diff_10_PHIND', 'Diff_10_PE', 'Diff_10_NM_M', 'Diff_10_RELPOS', 'Rdiff_10_Depth', 'Rdiff_10_GR', 'Rdiff_10_ILD_log10', 'Rdiff_10_DeltaPHI', 'Rdiff_10_PHIND', 'Rdiff_10_PE', 'Rdiff_10_NM_M', 'Rdiff_10_RELPOS', 'Mean_25_Depth', 'Mean_25_GR', 'Mean_25_ILD_log10', 'Mean_25_DeltaPHI', 'Mean_25_PHIND', 'Mean_25_PE', 'Mean_25_NM_M', 'Mean_25_RELPOS', 'Max_25_Depth', 'Max_25_GR', 'Max_25_ILD_log10', 'Max_25_DeltaPHI', 'Max_25_PHIND', 'Max_25_PE', 'Max_25_NM_M', 'Max_25_RELPOS', 'Min_25_Depth', 'Min_25_GR', 'Min_25_ILD_log10', 'Min_25_DeltaPHI', 'Min_25_PHIND', 'Min_25_PE', 'Min_25_NM_M', 'Min_25_RELPOS', 'Std_25_Depth', 'Std_25_GR', 'Std_25_ILD_log10', 'Std_25_DeltaPHI', 'Std_25_PHIND', 'Std_25_PE', 'Std_25_NM_M', 'Std_25_RELPOS', 'Var_25_Depth', 'Var_25_GR', 'Var_25_ILD_log10', 'Var_25_DeltaPHI', 'Var_25_PHIND', 'Var_25_PE', 'Var_25_NM_M', 'Var_25_RELPOS', 'Diff_25_Depth', 'Diff_25_GR', 'Diff_25_ILD_log10', 'Diff_25_DeltaPHI', 'Diff_25_PHIND', 'Diff_25_PE', 'Diff_25_NM_M', 
'Diff_25_RELPOS', 'Rdiff_25_Depth', 'Rdiff_25_GR', 'Rdiff_25_ILD_log10', 'Rdiff_25_DeltaPHI', 'Rdiff_25_PHIND', 'Rdiff_25_PE', 'Rdiff_25_NM_M', 'Rdiff_25_RELPOS', 'Mean_50_Depth', 'Mean_50_GR', 'Mean_50_ILD_log10', 'Mean_50_DeltaPHI', 'Mean_50_PHIND', 'Mean_50_PE', 'Mean_50_NM_M', 'Mean_50_RELPOS', 'Max_50_Depth', 'Max_50_GR', 'Max_50_ILD_log10', 'Max_50_DeltaPHI', 'Max_50_PHIND', 'Max_50_PE', 'Max_50_NM_M', 'Max_50_RELPOS', 'Min_50_Depth', 'Min_50_GR', 'Min_50_ILD_log10', 'Min_50_DeltaPHI', 'Min_50_PHIND', 'Min_50_PE', 'Min_50_NM_M', 'Min_50_RELPOS', 'Std_50_Depth', 'Std_50_GR', 'Std_50_ILD_log10', 'Std_50_DeltaPHI', 'Std_50_PHIND', 'Std_50_PE', 'Std_50_NM_M', 'Std_50_RELPOS', 'Var_50_Depth', 'Var_50_GR', 'Var_50_ILD_log10', 'Var_50_DeltaPHI', 'Var_50_PHIND', 'Var_50_PE', 'Var_50_NM_M', 'Var_50_RELPOS', 'Diff_50_Depth', 'Diff_50_GR', 'Diff_50_ILD_log10', 'Diff_50_DeltaPHI', 'Diff_50_PHIND', 'Diff_50_PE', 'Diff_50_NM_M', 'Diff_50_RELPOS', 'Rdiff_50_Depth', 'Rdiff_50_GR', 'Rdiff_50_ILD_log10', 'Rdiff_50_DeltaPHI', 'Rdiff_50_PHIND', 'Rdiff_50_PE', 'Rdiff_50_NM_M', 'Rdiff_50_RELPOS', 'Well Name', 'orig_index', 'above_Depth', 'above_GR', 'above_ILD_log10', 'above_DeltaPHI', 'above_PHIND', 'above_PE', 'above_NM_M', 'above_RELPOS', 'below_Depth', 'below_GR', 'below_ILD_log10', 'below_DeltaPHI', 'below_PHIND', 'below_PE', 'below_NM_M', 'below_RELPOS', 'Depth_GR', 'Depth_ILD_log10', 'Depth_DeltaPHI', 'Depth_PHIND', 'Depth_PE', 'Depth_NM_M', 'Depth_RELPOS', 'GR_Depth', 'GR_ILD_log10', 'GR_DeltaPHI', 'GR_PHIND', 'GR_PE', 'GR_NM_M', 'GR_RELPOS', 'ILD_log10_Depth', 'ILD_log10_GR', 'ILD_log10_DeltaPHI', 'ILD_log10_PHIND', 'ILD_log10_PE', 'ILD_log10_NM_M', 'ILD_log10_RELPOS', 'DeltaPHI_Depth', 'DeltaPHI_GR', 'DeltaPHI_ILD_log10', 'DeltaPHI_PHIND', 'DeltaPHI_PE', 'DeltaPHI_NM_M', 'DeltaPHI_RELPOS', 'PHIND_Depth', 'PHIND_GR', 'PHIND_ILD_log10', 'PHIND_DeltaPHI', 'PHIND_PE', 'PHIND_NM_M', 'PHIND_RELPOS', 'PE_Depth', 'PE_GR', 'PE_ILD_log10', 'PE_DeltaPHI', 'PE_PHIND', 'PE_NM_M', 'PE_RELPOS', 'NM_M_Depth', 'NM_M_GR', 'NM_M_ILD_log10', 'NM_M_DeltaPHI', 'NM_M_PHIND', 'NM_M_PE', 'NM_M_RELPOS', 'RELPOS_Depth', 'RELPOS_GR', 'RELPOS_ILD_log10', 'RELPOS_DeltaPHI', 'RELPOS_PHIND', 'RELPOS_PE', 'RELPOS_NM_M']

In [15]:
train_f, test_f = train_test_split(upsampled_frame_std, test_size = 0.2, 
                                   random_state = 72)
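
Because adjacent depth samples are strongly correlated, a random row-wise split lets near-duplicates of training rows land in the held-out fold, which flatters the score below. A more conservative alternative (not used here) holds out whole wells:

# Hypothetical well-held-out split; the scores reported below use the
# random row split above instead.
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=72)
train_idx, test_idx = next(gss.split(upsampled_frame_std[features],
                                     upsampled_frame_std['class'],
                                     groups=upsampled_frame_std['Well Name']))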

In [16]:
# --------------------------
# TPOT Generated Model
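# The FeatureUnion below is TPOT's stacking idiom: VotingClassifier exposes a
# transform() that returns its (here, ExtraTrees) predictions, while the
# identity FunctionTransformer passes the original features through untouched,
# so the final DecisionTreeClassifier trains on the inputs plus the
# ExtraTrees prediction as an extra feature.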
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

exported_pipeline = make_pipeline(
    make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=0.36, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    DecisionTreeClassifier()
)

exported_pipeline.fit(train_f[features], train_f['class'])


Out[16]:
Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('votingclassifier', VotingClassifier(estimators=[('est', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.36, max_leaf_nodes=None,
           min_impurity_s...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [17]:
exported_pipeline.score(test_f[features], test_f['class'])


Out[17]:
0.87789799072642971

In [21]:
result = exported_pipeline.predict(test_f[features])

from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm

conf = confusion_matrix(test_f['class'], result)
display_cm(conf, facies_labels, hide_zeros = True, display_metrics = True)

def accuracy(conf):
    total_correct = 0
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc

print(accuracy(conf))

adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7], [5, 6, 8], [6, 7]])

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

print(accuracy_adjacent(conf, adjacent_facies))


     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS    46     6     1                                        53
     CSiS     2   132    10                             1         145
     FSiS           4   115                                       119
     SiSh     1     1          31     1     3           2          39
       MS                       3    29     7           5          44
       WS                       3     5    77           9     1    95
        D                       1           1    16                18
       PS                 2           1     6     1    96     1   107
       BS                                   1                26    27

Precision  0.94  0.92  0.90  0.82  0.81  0.81  0.94  0.85  0.93  0.88
   Recall  0.87  0.91  0.97  0.79  0.66  0.81  0.89  0.90  0.96  0.88
       F1  0.90  0.92  0.93  0.81  0.73  0.81  0.91  0.87  0.95  0.88
0.877897990726
0.964451313756

Now load and process the test data set, then predict using the 'exported_pipeline' model.
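
The cell below repeats the feature engineering of In [6]-[14] almost verbatim. A sketch of how the shared per-well construction could be factored into a reusable helper (build_features is a hypothetical name; 'class' handling is left to the caller since the test set has none):

def build_features(df, resample_factors, div_columns):
    # Mirrors the per-well loop used for the training data
    frames, above, below = [], [], []
    for well, group in df.groupby('Well Name'):
        parts = []
        for f in resample_factors:
            w = group[div_columns]
            # Rolling statistics, named e.g. 'Mean_5_GR'
            for stat in ('mean', 'max', 'min', 'std', 'var'):
                s = getattr(w.rolling(window=f, center=True), stat)()
                s = s.interpolate(method='index', limit_direction='both')
                s.columns = ['{0}_{1}_{2}'.format(stat.capitalize(), f, c)
                             for c in s.columns]
                parts.append(s)
            # Forward and reverse differences over the same span
            d = w.diff(f).interpolate(method='index', limit_direction='both')
            d.columns = ['Diff_{0}_{1}'.format(f, c) for c in d.columns]
            r = w.sort_index(ascending=False).diff(f).interpolate(
                method='index', limit_direction='both').sort_index()
            r.columns = ['Rdiff_{0}_{1}'.format(f, c) for c in r.columns]
            parts += [d, r]
        well_frame = pd.concat(parts, axis=1)
        well_frame['Well Name'] = well
        well_frame['orig_index'] = group['orig_index']
        srt = group.sort_values('Depth')
        above.append(srt.shift(-1).fillna(method='ffill')[div_columns])
        below.append(srt.shift(1).fillna(method='bfill')[div_columns])
        frames.append(well_frame.fillna(method='bfill').fillna(method='ffill'))
    return frames, above, below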


In [19]:
test_path = '../validation_data_nofacies.csv'

# Read test data to dataframe
test = pd.read_csv(test_path)

# Set string features to integers

for i, value in enumerate(test['Formation'].unique()):
    test.loc[test['Formation'] == value, 'Formation'] = i
    
for i, value in enumerate(test['Well Name'].unique()):
    test.loc[test['Well Name'] == value, 'Well Name'] = i

# Repeat the feature engineering used on the training data: per-well rolling
# statistics and adjacent-sample features, to capture more of the sample
# interdependency.
test['orig_index'] = test.index

# Use rolling windows through each well's samples, grouping by well name.

# Empty list to hold frames
mean_frames = []
above = []
below = []

for well, group in test.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
       'RELPOS']]
        
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
        skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]
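        # As in the training loop, skew_frame is computed but excluded from
        # f_frame, so the skew features are unused.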
        
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        
        constructor_list.append(f_frame)
        
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['Well Name'] = well
    # orig index is holding the original index locations, to make extracting the results trivial
    well_frame['orig_index'] = group['orig_index']
    df = group.sort_values('Depth')
    u = df.shift(-1).fillna(method = 'ffill')
    b = df.shift(1).fillna(method = 'bfill')
    above.append(u[div_columns])
    below.append(b[div_columns])
    
    mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))
    
frame = test
frame.index = frame['orig_index']
frame.drop(['orig_index', 'Well Name'], axis = 1, inplace = True)

for f in mean_frames:
    f.index = f['orig_index']

rolling_frame = pd.concat(mean_frames, axis = 0)
above_frame = pd.concat(above)
above_frame.columns = ['above_'+ column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_'+ column for column in below_frame.columns]
upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)

features = [feature for feature in upsampled_frame.columns if 'class' not in feature]

std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])
train_std = std_scaler.transform(upsampled_frame[features])
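# Note: this fits a fresh StandardScaler on the test wells rather than
# reusing the scaler fitted on the training data in In [12].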

train_std_frame = upsampled_frame
for i, column in enumerate(features):
    train_std_frame.loc[:, column] = train_std[:, i]

upsampled_frame_std = train_std_frame

for feature in div_columns:
    for f in div_columns:
        if f == feature:
            continue
        upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
        
features = [feature for feature in upsampled_frame.columns if 'class' not in feature]

In [20]:
# Predict result on full sample set
result = exported_pipeline.predict(upsampled_frame[features])
# Add result to test set
upsampled_frame['Facies'] = result
# Output to csv
upsampled_frame.to_csv('05 - Well Facies Prediction - Test Data Set.csv')
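
The predicted Facies column holds integer classes; translating back to label strings (assuming the same 1-9 ordering as facies_labels) is a one-liner:

# Hypothetical extra column mapping integer facies classes (1-9) back to labels
upsampled_frame['FaciesLabels'] = [facies_labels[int(c) - 1] for c in result]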
